knitr::opts_chunk$set(cache=FALSE, message=FALSE, warning=FALSE, prompt=FALSE,      tidy=TRUE, comment=NA)

Introdução

No serviço público é comum que alguns servidores não aposentam na data em que começa a ter o direito a aposentadoria e permanecem no trabalho até o tempo da idade compulsória recebendo um abono. Dessa forma, o objetivo do estudo consiste na exploração da teoria de análise de regressão logística para delinear o perfil dos servidores que adquirem o direito ao abono salarial e decidem se continuam trabalhando = 0 ou não = 1, situação que será a variável resposta de interesse neste caso.

Base de Dados

Para subsidiar este estudo, foram coletadas informações do Data Warehouse do Sistema Integrado de Administração de Pessoal - DW SIAPE dos servidores públicos do Ministério da Economia - ME, ativos e inativos no período de janeiro de 2019 a julho de 2019.

As informações extraídas foram:

• Nome • Sexo • CPF (chave primária) • Matricula SIAPE • CLASSE_CARGO • PADRAO_CARGO • SITUACAO_VINCULO • REGIME_JURIDICO • JORNADA_DE_TRABALHO • Remuneração • Ano/Mês inicial do abono de permanência • Valor Abono (caso tenha) • Todas as gratificações e funções comissionadas • Funções – VPNI • Quantidade de dependentes • Descrição do cargo emprego • Nível de Escolaridade • Denominação do órgão de atuação • UF da UPAG de vinculação • Denominação unidade organizacional • ORGSUP_EXERCICIO • UF da Residência • Cidade da residência • Situação servidor • Tipo DE Aposentadoria • Data ingresso no serviço público • Data de nascimento • DATA ocorrência inatividade

Descritivas dos dados

Inicialmente, selecionou-se o último mês desta base de dados, junho de 2019, e buscou os tempos iniciais para o estudo, ou seja, a data de início do abono e, para os casos da ocorrência do evento de aposentadorias, buscou-se a data deste fato.

mes0619 <- read.csv2("dados.csv")
mes0619 %>% DT::datatable()
summary(mes0619[, -c(1, 2)])
 SEXO                                          CARGO         JORNADA     
 F:9937   AUDITOR-FISCAL DA RECEITA FEDERAL BRASIL:5357   Min.   :20.00  
 M:8042   AGENTE ADMINISTRATIVO                   :3217   1st Qu.:40.00  
          ANALISTA TRIBUTARIO REC FEDERAL BRASIL  :2149   Median :40.00  
          AUDITOR FISCAL DO TRABALHO              :1335   Mean   :39.93  
          TECNICO DO SEGURO SOCIAL                :1167   3rd Qu.:40.00  
          AGENTE DE PORTARIA                      : 666   Max.   :40.00  
          (Other)                                 :4088                  
         Tipo_Regiao             id_ini_serv_pub  Idade_inicio_Serv
 Cidade_Grande :9122   entre 20 a 30 anos:10189   Min.   :11.88    
 Cidade_Média  :7364   entre 30 e 40 anos: 5214   1st Qu.:23.21    
 Cidade_Pequena:1493   mais de 40 anos   : 1378   Median :27.36    
                       menos de 20 anos  : 1198   Mean   :28.72    
                                                  3rd Qu.:32.98    
                                                  Max.   :62.03    
                                                                   
             id_aposentad  anos_de_aposent  Idade_Aposentadoria
 entre 50 a 60 anos:4704   Min.   : 0.093   Min.   :48.57      
 entre 60 e 70 anos:4328   1st Qu.: 1.619   1st Qu.:56.77      
 mais de 70 anos   : 472   Median : 4.636   Median :60.06      
 menos de 50 anos  :   5   Mean   : 4.901   Mean   :60.56      
 NA's              :8470   3rd Qu.: 7.463   3rd Qu.:63.92      
                           Max.   :15.126   Max.   :75.04      
                           NA's   :8470     NA's   :8470       
       D               R             liquido            desc_per      
 Min.   :    0   Min.   :  2895   Min.   :   302.4   Min.   :0.00000  
 1st Qu.: 1580   1st Qu.:  7917   1st Qu.:  6673.3   1st Qu.:0.07302  
 Median : 2676   Median : 20449   Median : 16356.8   Median :0.15176  
 Mean   : 3593   Mean   : 23833   Mean   : 20240.2   Mean   :0.17531  
 3rd Qu.: 4556   3rd Qu.: 42254   3rd Qu.: 34489.4   3rd Qu.:0.24943  
 Max.   :35071   Max.   :132037   Max.   :112528.6   Max.   :0.97570  
                                                                      
        Desc_Renda_Bruta     Qtde_dependentes     tempo         
 até 20%        :11157   até dois    : 6884   Min.   : 0.00274  
 entre 20% a 30%: 3743   dois ou mais:  543   1st Qu.: 2.02877  
 entre 30% a 40%: 2529   Nenhum      :10552   Median : 4.49589  
 mais de 40%    :  550                        Mean   : 5.40113  
                                              3rd Qu.: 7.83425  
                                              Max.   :20.61370  
                                                                
     status      
 Min.   :0.0000  
 1st Qu.:0.0000  
 Median :1.0000  
 Mean   :0.5289  
 3rd Qu.:1.0000  
 Max.   :1.0000  
                 

Exploração dos Dados de Sobrevivência - Estimador Kaplan-Meier

A partir do Estimador de Kaplan-Meier é possível observar o tempo mediano de 7.66 anos que os servidores em abono salarial permanecem no trabalho até decidirem aposentarem.

require("ggplot2")
.df <- na.omit(data.frame(x = mes0619$tempo, y = mes0619$status, z = factor("At risk")))
.df <- .df[do.call(order, .df[, c("z", "x"), drop = FALSE]), , drop = FALSE]
.fit <- survival::survfit(survival::Surv(time = x, event = y, type = "right") ~ 
    z, .df)
.fit <- data.frame(x = .fit$time, y = .fit$surv, nrisk = .fit$n.risk, nevent = .fit$n.event, 
    ncensor = .fit$n.censor, upper = .fit$upper, lower = .fit$lower)
.df <- .df[!duplicated(.df[, c("x", "z")]), ]
.df <- .fit <- data.frame(.fit, .df[, c("z"), drop = FALSE])
.med <- plyr::ddply(.fit, plyr::.(z), function(x) {
    data.frame(median = min(subset(x, y < (0.5 + .Machine$double.eps^0.5))$x))
})
.df <- .fit <- rbind(unique(data.frame(x = 0, y = 1, nrisk = NA, nevent = NA, 
    ncensor = NA, upper = 1, lower = 1, .df[, c("z"), drop = FALSE])), .fit)
.cens <- subset(.fit, ncensor == 1)
.tmp1 <- data.frame(as.table(by(.df, .df[, c("z"), drop = FALSE], function(d) max(d$nrisk, 
    na.rm = TRUE))))
.tmp1$x <- 0
.nrisk <- .tmp1
for (i in 1:3) {
    .df <- subset(.fit, x < 7 * i)
    .tmp2 <- data.frame(as.table(by(.df, .df[, c("z"), drop = FALSE], function(d) if (all(is.na(d$nrisk))) 
        NA else min(d$nrisk - d$nevent - d$ncensor, na.rm = TRUE))))
    .tmp2$x <- 7 * i
    .tmp2$Freq[is.na(.tmp2$Freq)] <- .tmp1$Freq[is.na(.tmp2$Freq)]
    .tmp1 <- .tmp2
    .nrisk <- rbind(.nrisk, .tmp2)
}
.nrisk$y <- rep(seq(0.025, 0.025, -0.05), 4)
.plot <- ggplot(data = .fit, aes(x = x, y = y, colour = z)) + geom_step(data = subset(.fit, 
    !is.na(upper)), aes(y = upper), size = 1, lty = 2, alpha = 0.5, show.legend = FALSE, 
    na.rm = FALSE) + geom_step(data = subset(.fit, !is.na(lower)), aes(y = lower), 
    size = 1, lty = 2, alpha = 0.5, show.legend = FALSE, na.rm = FALSE) + geom_step(size = 1.5) + 
    geom_linerange(data = .cens, aes(x = x, ymin = y, ymax = y + 0.02), size = 1.5) + 
    geom_vline(data = .med, aes(xintercept = median), colour = "black", lty = 2) + 
    scale_x_continuous(breaks = seq(0, 21, by = 7), limits = c(0, 21)) + scale_y_continuous(limits = c(0, 
    1), expand = c(0.01, 0)) + scale_colour_brewer(palette = "Set1") + xlab("Tempo em anos de abono") + 
    ylab("Probabilidade S(t)") + labs(title = "Estimador de Kaplan-Meier") + 
    theme_grey(base_size = 14, base_family = "sans") + theme(legend.position = "none")
.nrisk$y <- 0.5
.plot2 <- ggplot(data = .nrisk, aes(x = x, y = y, label = Freq, colour = z)) + 
    geom_text(size = 14 * 0.282, family = "sans") + scale_x_continuous(breaks = seq(0, 
    21, by = 7), limits = c(0, 21)) + scale_y_continuous(limits = c(0, 1)) + 
    scale_colour_brewer(palette = "Set1") + ylab("Tempo em anos de abono") + 
    RcmdrPlugin.KMggplot2::theme_natrisk(theme_grey, 14, "sans")
.plot3 <- ggplot(data = subset(.nrisk, x == 0), aes(x = x, y = y, label = z, 
    colour = z)) + geom_text(hjust = 0, size = 14 * 0.282, family = "sans") + 
    scale_x_continuous(limits = c(-5, 5)) + scale_y_continuous(limits = c(0, 
    1)) + scale_colour_brewer(palette = "Set1") + RcmdrPlugin.KMggplot2::theme_natrisk21(theme_grey, 
    14, "sans")
.plotb <- ggplot(.df, aes(x = x, y = y)) + geom_blank() + RcmdrPlugin.KMggplot2::theme_natriskbg(theme_grey, 
    14, "sans")
grid::grid.newpage()
grid::pushViewport(grid::viewport(layout = grid::grid.layout(2, 2, heights = unit(c(1, 
    2.5), c("null", "lines")), widths = unit(c(4, 1), c("lines", "null")))))
print(.plotb, vp = grid::viewport(layout.pos.row = 1:2, layout.pos.col = 1:2))
print(.plot, vp = grid::viewport(layout.pos.row = 1, layout.pos.col = 1:2))
print(.plot2, vp = grid::viewport(layout.pos.row = 2, layout.pos.col = 1:2))
print(.plot3, vp = grid::viewport(layout.pos.row = 2, layout.pos.col = 1))

.plot <- recordPlot()
print(.plot)
rm(.df, .fit, .pmed, .med, .cens, .tmp1, .tmp2, .nrisk, .plot, .plot2, .plot3, 
    .plotb)
require("ggplot2")
.df <- na.omit(data.frame(x = mes0619$tempo, y = mes0619$status, z = mes0619$SEXO))
.df <- .df[do.call(order, .df[, c("z", "x"), drop = FALSE]), , drop = FALSE]
.fit <- survival::survfit(survival::Surv(time = x, event = y, type = "right") ~ 
    z, .df)
.pval <- plyr::ddply(.df, plyr::.(), function(x) {
    data.frame(x = 0, y = 0, df = 1, chisq = survival::survdiff(survival::Surv(time = x, 
        event = y, type = "right") ~ z, x)$chisq)
})
.pval$label <- paste0("paste(italic(p), \" = ", signif(1 - pchisq(.pval$chisq, 
    .pval$df), 3), "\")")
.fit <- data.frame(x = .fit$time, y = .fit$surv, nrisk = .fit$n.risk, nevent = .fit$n.event, 
    ncensor = .fit$n.censor, upper = .fit$upper, lower = .fit$lower)
.df <- .df[!duplicated(.df[, c("x", "z")]), ]
.df <- .fit <- data.frame(.fit, .df[, c("z"), drop = FALSE])
.df <- .fit <- rbind(unique(data.frame(x = 0, y = 1, nrisk = NA, nevent = NA, 
    ncensor = NA, upper = 1, lower = 1, .df[, c("z"), drop = FALSE])), .fit)
.cens <- subset(.fit, ncensor == 1)
.tmp1 <- data.frame(as.table(by(.df, .df[, c("z"), drop = FALSE], function(d) max(d$nrisk, 
    na.rm = TRUE))))
.tmp1$x <- 0
.nrisk <- .tmp1
for (i in 1:3) {
    .df <- subset(.fit, x < 7 * i)
    .tmp2 <- data.frame(as.table(by(.df, .df[, c("z"), drop = FALSE], function(d) if (all(is.na(d$nrisk))) 
        NA else min(d$nrisk - d$nevent - d$ncensor, na.rm = TRUE))))
    .tmp2$x <- 7 * i
    .tmp2$Freq[is.na(.tmp2$Freq)] <- .tmp1$Freq[is.na(.tmp2$Freq)]
    .tmp1 <- .tmp2
    .nrisk <- rbind(.nrisk, .tmp2)
}
.nrisk$y <- rep(seq(0.075, 0.025, -0.05), 4)
.plot <- ggplot(data = .fit, aes(x = x, y = y, colour = z)) + geom_step(data = subset(.fit, 
    !is.na(upper)), aes(y = upper), size = 1, lty = 2, alpha = 0.5, show.legend = FALSE, 
    na.rm = FALSE) + geom_step(data = subset(.fit, !is.na(lower)), aes(y = lower), 
    size = 1, lty = 2, alpha = 0.5, show.legend = FALSE, na.rm = FALSE) + geom_step(size = 1.5) + 
    geom_linerange(data = .cens, aes(x = x, ymin = y, ymax = y + 0.02), size = 1.5) + 
    geom_text(data = .pval, aes(y = y, x = x, label = label), colour = "black", 
        hjust = 0, vjust = -0.5, parse = TRUE, show.legend = FALSE, size = 14 * 
            0.282, family = "sans") + scale_x_continuous(breaks = seq(0, 21, 
    by = 7), limits = c(0, 21)) + scale_y_continuous(limits = c(0, 1), expand = c(0.01, 
    0)) + scale_colour_brewer(palette = "Set1") + xlab("Tempo em anos de abono") + 
    ylab("Probabidade S(t)") + labs(colour = "SEXO") + labs(title = "Estimador de Kaplan-Meier") + 
    theme_grey(base_size = 14, base_family = "sans") + theme(legend.position = c(1, 
    1), legend.justification = c(1, 1), legend.background = element_rect(fill = "transparent"))
.nrisk$y <- ((.nrisk$y - 0.025)/(max(.nrisk$y) - 0.025) + 0.5) * 0.5
.plot2 <- ggplot(data = .nrisk, aes(x = x, y = y, label = Freq, colour = z)) + 
    geom_text(size = 14 * 0.282, family = "sans") + scale_x_continuous(breaks = seq(0, 
    21, by = 7), limits = c(0, 21)) + scale_y_continuous(limits = c(0, 1)) + 
    scale_colour_brewer(palette = "Set1") + ylab("Probabidade S(t)") + RcmdrPlugin.KMggplot2::theme_natrisk(theme_grey, 
    14, "sans")
.plot3 <- ggplot(data = subset(.nrisk, x == 0), aes(x = x, y = y, label = z, 
    colour = z)) + geom_text(hjust = 0, size = 14 * 0.282, family = "sans") + 
    scale_x_continuous(limits = c(-5, 5)) + scale_y_continuous(limits = c(0, 
    1)) + scale_colour_brewer(palette = "Set1") + RcmdrPlugin.KMggplot2::theme_natrisk21(theme_grey, 
    14, "sans")
.plotb <- ggplot(.df, aes(x = x, y = y)) + geom_blank() + RcmdrPlugin.KMggplot2::theme_natriskbg(theme_grey, 
    14, "sans")
grid::grid.newpage()
grid::pushViewport(grid::viewport(layout = grid::grid.layout(2, 2, heights = unit(c(1, 
    3), c("null", "lines")), widths = unit(c(4, 1), c("lines", "null")))))
print(.plotb, vp = grid::viewport(layout.pos.row = 1:2, layout.pos.col = 1:2))
print(.plot, vp = grid::viewport(layout.pos.row = 1, layout.pos.col = 1:2))
print(.plot2, vp = grid::viewport(layout.pos.row = 2, layout.pos.col = 1:2))
print(.plot3, vp = grid::viewport(layout.pos.row = 2, layout.pos.col = 1))

.plot <- recordPlot()
print(.plot)
rm(.df, .fit, .pval, .ppval, .cens, .tmp1, .tmp2, .nrisk, .plot, .plot2, .plot3, 
    .plotb)
.Survfit <- survfit(Surv(tempo, status, type = "right") ~ SEXO, conf.type = "log", 
    conf.int = 0.95, type = "kaplan-meier", error = "greenwood", data = mes0619)
.Survfit
Call: survfit(formula = Surv(tempo, status, type = "right") ~ SEXO, 
    data = mes0619, error = "greenwood", conf.type = "log", conf.int = 0.95, 
    type = "kaplan-meier")

          n events median 0.95LCL 0.95UCL
SEXO=F 9937   5256   7.66    7.42    7.95
SEXO=M 8042   4253   6.67    6.46    6.94
quantile(.Survfit, quantiles = c(0.25, 0.5, 0.75))
$quantile
             25       50       75
SEXO=F 3.463014 7.657534 14.91233
SEXO=M 2.906849 6.673973 11.42192

$lower
             25       50       75
SEXO=F 3.317808 7.419178 14.45479
SEXO=M 2.772603 6.460274 11.11507

$upper
             25       50       75
SEXO=F 3.621918 7.947945 15.20274
SEXO=M 3.021918 6.936986 11.68493
remove(.Survfit)
require("ggplot2")
.df <- na.omit(data.frame(x = mes0619$tempo, y = mes0619$status, z = mes0619$id_ini_serv_pub))
.df <- .df[do.call(order, .df[, c("z", "x"), drop = FALSE]), , drop = FALSE]
.fit <- survival::survfit(survival::Surv(time = x, event = y, type = "right") ~ 
    z, .df)
.pval <- plyr::ddply(.df, plyr::.(), function(x) {
    data.frame(x = 0, y = 0, df = 3, chisq = survival::survdiff(survival::Surv(time = x, 
        event = y, type = "right") ~ z, x)$chisq)
})
.pval$label <- paste0("paste(italic(p), \" = ", signif(1 - pchisq(.pval$chisq, 
    .pval$df), 3), "\")")
.fit <- data.frame(x = .fit$time, y = .fit$surv, nrisk = .fit$n.risk, nevent = .fit$n.event, 
    ncensor = .fit$n.censor, upper = .fit$upper, lower = .fit$lower)
.df <- .df[!duplicated(.df[, c("x", "z")]), ]
.df <- .fit <- data.frame(.fit, .df[, c("z"), drop = FALSE])
.df <- .fit <- rbind(unique(data.frame(x = 0, y = 1, nrisk = NA, nevent = NA, 
    ncensor = NA, upper = 1, lower = 1, .df[, c("z"), drop = FALSE])), .fit)
.cens <- subset(.fit, ncensor == 1)
.tmp1 <- data.frame(as.table(by(.df, .df[, c("z"), drop = FALSE], function(d) max(d$nrisk, 
    na.rm = TRUE))))
.tmp1$x <- 0
.nrisk <- .tmp1
for (i in 1:3) {
    .df <- subset(.fit, x < 7 * i)
    .tmp2 <- data.frame(as.table(by(.df, .df[, c("z"), drop = FALSE], function(d) if (all(is.na(d$nrisk))) 
        NA else min(d$nrisk - d$nevent - d$ncensor, na.rm = TRUE))))
    .tmp2$x <- 7 * i
    .tmp2$Freq[is.na(.tmp2$Freq)] <- .tmp1$Freq[is.na(.tmp2$Freq)]
    .tmp1 <- .tmp2
    .nrisk <- rbind(.nrisk, .tmp2)
}
.nrisk$y <- rep(seq(0.175, 0.025, -0.05), 4)
.plot <- ggplot(data = .fit, aes(x = x, y = y, colour = z)) + geom_step(data = subset(.fit, 
    !is.na(upper)), aes(y = upper), size = 1, lty = 2, alpha = 0.5, show.legend = FALSE, 
    na.rm = FALSE) + geom_step(data = subset(.fit, !is.na(lower)), aes(y = lower), 
    size = 1, lty = 2, alpha = 0.5, show.legend = FALSE, na.rm = FALSE) + geom_step(size = 1.5) + 
    geom_linerange(data = .cens, aes(x = x, ymin = y, ymax = y + 0.02), size = 1.5) + 
    geom_text(data = .pval, aes(y = y, x = x, label = label), colour = "black", 
        hjust = 0, vjust = -0.5, parse = TRUE, show.legend = FALSE, size = 14 * 
            0.282, family = "sans") + scale_x_continuous(breaks = seq(0, 21, 
    by = 7), limits = c(0, 21)) + scale_y_continuous(limits = c(0, 1), expand = c(0.01, 
    0)) + scale_colour_brewer(palette = "Set1") + xlab("Tempo em anos de abono") + 
    ylab("Probabilidade de S(t)") + labs(colour = "id_ini_serv_pub") + labs(title = "Estimador de Kaplan-Meier") + 
    theme_grey(base_size = 14, base_family = "sans") + theme(legend.position = c(1, 
    1), legend.justification = c(1, 1), legend.background = element_rect(fill = "transparent"))
.nrisk$y <- ((.nrisk$y - 0.025)/(max(.nrisk$y) - 0.025) + 0.125) * 0.8
.plot2 <- ggplot(data = .nrisk, aes(x = x, y = y, label = Freq, colour = z)) + 
    geom_text(size = 14 * 0.282, family = "sans") + scale_x_continuous(breaks = seq(0, 
    21, by = 7), limits = c(0, 21)) + scale_y_continuous(limits = c(0, 1)) + 
    scale_colour_brewer(palette = "Set1") + ylab("Probabilidade de S(t)") + 
    RcmdrPlugin.KMggplot2::theme_natrisk(theme_grey, 14, "sans")
.plot3 <- ggplot(data = subset(.nrisk, x == 0), aes(x = x, y = y, label = z, 
    colour = z)) + geom_text(hjust = 0, size = 14 * 0.282, family = "sans") + 
    scale_x_continuous(limits = c(-5, 5)) + scale_y_continuous(limits = c(0, 
    1)) + scale_colour_brewer(palette = "Set1") + RcmdrPlugin.KMggplot2::theme_natrisk21(theme_grey, 
    14, "sans")
.plotb <- ggplot(.df, aes(x = x, y = y)) + geom_blank() + RcmdrPlugin.KMggplot2::theme_natriskbg(theme_grey, 
    14, "sans")
grid::grid.newpage()
grid::pushViewport(grid::viewport(layout = grid::grid.layout(2, 2, heights = unit(c(1, 
    4), c("null", "lines")), widths = unit(c(4, 1), c("lines", "null")))))
print(.plotb, vp = grid::viewport(layout.pos.row = 1:2, layout.pos.col = 1:2))
print(.plot, vp = grid::viewport(layout.pos.row = 1, layout.pos.col = 1:2))
print(.plot2, vp = grid::viewport(layout.pos.row = 2, layout.pos.col = 1:2))
print(.plot3, vp = grid::viewport(layout.pos.row = 2, layout.pos.col = 1))

.plot <- recordPlot()
print(.plot)
rm(.df, .fit, .pval, .ppval, .cens, .tmp1, .tmp2, .nrisk, .plot, .plot2, .plot3, 
    .plotb)
.Survfit <- survfit(Surv(tempo, status, type = "right") ~ id_ini_serv_pub, conf.type = "log", 
    conf.int = 0.95, type = "kaplan-meier", error = "greenwood", data = mes0619)
.Survfit
Call: survfit(formula = Surv(tempo, status, type = "right") ~ id_ini_serv_pub, 
    data = mes0619, error = "greenwood", conf.type = "log", conf.int = 0.95, 
    type = "kaplan-meier")

                                       n events median 0.95LCL 0.95UCL
id_ini_serv_pub=entre 20 a 30 anos 10189   4878   7.93    7.65    8.17
id_ini_serv_pub=entre 30 e 40 anos  5214   3272   6.39    6.23    6.61
id_ini_serv_pub=mais de 40 anos     1378    890   6.65    6.29    7.14
id_ini_serv_pub=menos de 20 anos    1198    469   7.78    7.10    8.70
quantile(.Survfit, quantiles = c(0.25, 0.5, 0.75))
$quantile
                                         25       50       75
id_ini_serv_pub=entre 20 a 30 anos 3.413699 7.926027 15.09589
id_ini_serv_pub=entre 30 e 40 anos 2.802740 6.389041 11.56712
id_ini_serv_pub=mais de 40 anos    2.975342 6.652055 10.04110
id_ini_serv_pub=menos de 20 anos   3.358904 7.778082 15.40000

$lower
                                         25       50        75
id_ini_serv_pub=entre 20 a 30 anos 3.249315 7.646575 14.731507
id_ini_serv_pub=entre 30 e 40 anos 2.597260 6.232877 11.213699
id_ini_serv_pub=mais de 40 anos    2.778082 6.290411  9.536986
id_ini_serv_pub=menos de 20 anos   3.128767 7.104110 13.942466

$upper
                                         25       50       75
id_ini_serv_pub=entre 20 a 30 anos 3.556164 8.172603 15.32055
id_ini_serv_pub=entre 30 e 40 anos 2.978082 6.605479 12.20274
id_ini_serv_pub=mais de 40 anos    3.383562 7.139726 10.72055
id_ini_serv_pub=menos de 20 anos   3.810959 8.704110       NA
remove(.Survfit)
require("ggplot2")
.df <- na.omit(data.frame(x = mes0619$tempo, y = mes0619$status, z = mes0619$Tipo_Regiao))
.df <- .df[do.call(order, .df[, c("z", "x"), drop = FALSE]), , drop = FALSE]
.fit <- survival::survfit(survival::Surv(time = x, event = y, type = "right") ~ 
    z, .df)
.pval <- plyr::ddply(.df, plyr::.(), function(x) {
    data.frame(x = 0, y = 0, df = 2, chisq = survival::survdiff(survival::Surv(time = x, 
        event = y, type = "right") ~ z, x)$chisq)
})
.pval$label <- paste0("paste(italic(p), \" = ", signif(1 - pchisq(.pval$chisq, 
    .pval$df), 3), "\")")
.fit <- data.frame(x = .fit$time, y = .fit$surv, nrisk = .fit$n.risk, nevent = .fit$n.event, 
    ncensor = .fit$n.censor, upper = .fit$upper, lower = .fit$lower)
.df <- .df[!duplicated(.df[, c("x", "z")]), ]
.df <- .fit <- data.frame(.fit, .df[, c("z"), drop = FALSE])
.df <- .fit <- rbind(unique(data.frame(x = 0, y = 1, nrisk = NA, nevent = NA, 
    ncensor = NA, upper = 1, lower = 1, .df[, c("z"), drop = FALSE])), .fit)
.cens <- subset(.fit, ncensor == 1)
.tmp1 <- data.frame(as.table(by(.df, .df[, c("z"), drop = FALSE], function(d) max(d$nrisk, 
    na.rm = TRUE))))
.tmp1$x <- 0
.nrisk <- .tmp1
for (i in 1:3) {
    .df <- subset(.fit, x < 7 * i)
    .tmp2 <- data.frame(as.table(by(.df, .df[, c("z"), drop = FALSE], function(d) if (all(is.na(d$nrisk))) 
        NA else min(d$nrisk - d$nevent - d$ncensor, na.rm = TRUE))))
    .tmp2$x <- 7 * i
    .tmp2$Freq[is.na(.tmp2$Freq)] <- .tmp1$Freq[is.na(.tmp2$Freq)]
    .tmp1 <- .tmp2
    .nrisk <- rbind(.nrisk, .tmp2)
}
.nrisk$y <- rep(seq(0.125, 0.025, -0.05), 4)
.plot <- ggplot(data = .fit, aes(x = x, y = y, colour = z)) + geom_step(data = subset(.fit, 
    !is.na(upper)), aes(y = upper), size = 1, lty = 2, alpha = 0.5, show.legend = FALSE, 
    na.rm = FALSE) + geom_step(data = subset(.fit, !is.na(lower)), aes(y = lower), 
    size = 1, lty = 2, alpha = 0.5, show.legend = FALSE, na.rm = FALSE) + geom_step(size = 1.5) + 
    geom_linerange(data = .cens, aes(x = x, ymin = y, ymax = y + 0.02), size = 1.5) + 
    geom_text(data = .pval, aes(y = y, x = x, label = label), colour = "black", 
        hjust = 0, vjust = -0.5, parse = TRUE, show.legend = FALSE, size = 14 * 
            0.282, family = "sans") + scale_x_continuous(breaks = seq(0, 21, 
    by = 7), limits = c(0, 21)) + scale_y_continuous(limits = c(0, 1), expand = c(0.01, 
    0)) + scale_colour_brewer(palette = "Set1") + xlab("Tempo em anos de abono") + 
    ylab("Propabilidade de S(t)") + labs(colour = "Tipo_Regiao") + labs(title = "Estimador de Kaplan-Meier") + 
    theme_grey(base_size = 14, base_family = "sans") + theme(legend.position = c(1, 
    1), legend.justification = c(1, 1), legend.background = element_rect(fill = "transparent"))
.nrisk$y <- ((.nrisk$y - 0.025)/(max(.nrisk$y) - 0.025) + 0.125) * 0.8
.plot2 <- ggplot(data = .nrisk, aes(x = x, y = y, label = Freq, colour = z)) + 
    geom_text(size = 14 * 0.282, family = "sans") + scale_x_continuous(breaks = seq(0, 
    21, by = 7), limits = c(0, 21)) + scale_y_continuous(limits = c(0, 1)) + 
    scale_colour_brewer(palette = "Set1") + ylab("Propabilidade de S(t)") + 
    RcmdrPlugin.KMggplot2::theme_natrisk(theme_grey, 14, "sans")
.plot3 <- ggplot(data = subset(.nrisk, x == 0), aes(x = x, y = y, label = z, 
    colour = z)) + geom_text(hjust = 0, size = 14 * 0.282, family = "sans") + 
    scale_x_continuous(limits = c(-5, 5)) + scale_y_continuous(limits = c(0, 
    1)) + scale_colour_brewer(palette = "Set1") + RcmdrPlugin.KMggplot2::theme_natrisk21(theme_grey, 
    14, "sans")
.plotb <- ggplot(.df, aes(x = x, y = y)) + geom_blank() + RcmdrPlugin.KMggplot2::theme_natriskbg(theme_grey, 
    14, "sans")
grid::grid.newpage()
grid::pushViewport(grid::viewport(layout = grid::grid.layout(2, 2, heights = unit(c(1, 
    3.5), c("null", "lines")), widths = unit(c(4, 1), c("lines", "null")))))
print(.plotb, vp = grid::viewport(layout.pos.row = 1:2, layout.pos.col = 1:2))
print(.plot, vp = grid::viewport(layout.pos.row = 1, layout.pos.col = 1:2))
print(.plot2, vp = grid::viewport(layout.pos.row = 2, layout.pos.col = 1:2))
print(.plot3, vp = grid::viewport(layout.pos.row = 2, layout.pos.col = 1))

.plot <- recordPlot()
print(.plot)
rm(.df, .fit, .pval, .ppval, .cens, .tmp1, .tmp2, .nrisk, .plot, .plot2, .plot3, 
    .plotb)
.Survfit <- survfit(Surv(tempo, status, type = "right") ~ Tipo_Regiao, conf.type = "log", 
    conf.int = 0.95, type = "kaplan-meier", error = "greenwood", data = mes0619)
.Survfit
Call: survfit(formula = Surv(tempo, status, type = "right") ~ Tipo_Regiao, 
    data = mes0619, error = "greenwood", conf.type = "log", conf.int = 0.95, 
    type = "kaplan-meier")

                              n events median 0.95LCL 0.95UCL
Tipo_Regiao=Cidade_Grande  9122   4661   7.84    7.60    8.10
Tipo_Regiao=Cidade_Média   7364   4036   6.58    6.39    6.81
Tipo_Regiao=Cidade_Pequena 1493    812   7.13    6.58    7.45
quantile(.Survfit, quantiles = c(0.25, 0.5, 0.75))
$quantile
                                 25       50       75
Tipo_Regiao=Cidade_Grande  3.572603 7.838356 14.24384
Tipo_Regiao=Cidade_Média   2.865753 6.575342 12.30411
Tipo_Regiao=Cidade_Pequena 3.139726 7.128767 12.42740

$lower
                                 25       50       75
Tipo_Regiao=Cidade_Grande  3.405479 7.597260 13.69589
Tipo_Regiao=Cidade_Média   2.736986 6.386301 11.56712
Tipo_Regiao=Cidade_Pequena 2.805479 6.578082 11.27945

$upper
                                 25       50       75
Tipo_Regiao=Cidade_Grande  3.726027 8.104110 14.75890
Tipo_Regiao=Cidade_Média   2.991781 6.810959 13.19452
Tipo_Regiao=Cidade_Pequena 3.454795 7.449315 14.22466
remove(.Survfit)
require("ggplot2")
.df <- na.omit(data.frame(x = mes0619$tempo, y = mes0619$status, z = mes0619$Desc_Renda_Bruta))
.df <- .df[do.call(order, .df[, c("z", "x"), drop = FALSE]), , drop = FALSE]
.fit <- survival::survfit(survival::Surv(time = x, event = y, type = "right") ~ 
    z, .df)
.pval <- plyr::ddply(.df, plyr::.(), function(x) {
    data.frame(x = 0, y = 0, df = 3, chisq = survival::survdiff(survival::Surv(time = x, 
        event = y, type = "right") ~ z, x)$chisq)
})
.pval$label <- paste0("paste(italic(p), \" = ", signif(1 - pchisq(.pval$chisq, 
    .pval$df), 3), "\")")
.fit <- data.frame(x = .fit$time, y = .fit$surv, nrisk = .fit$n.risk, nevent = .fit$n.event, 
    ncensor = .fit$n.censor, upper = .fit$upper, lower = .fit$lower)
.df <- .df[!duplicated(.df[, c("x", "z")]), ]
.df <- .fit <- data.frame(.fit, .df[, c("z"), drop = FALSE])
.df <- .fit <- rbind(unique(data.frame(x = 0, y = 1, nrisk = NA, nevent = NA, 
    ncensor = NA, upper = 1, lower = 1, .df[, c("z"), drop = FALSE])), .fit)
.cens <- subset(.fit, ncensor == 1)
.tmp1 <- data.frame(as.table(by(.df, .df[, c("z"), drop = FALSE], function(d) max(d$nrisk, 
    na.rm = TRUE))))
.tmp1$x <- 0
.nrisk <- .tmp1
for (i in 1:3) {
    .df <- subset(.fit, x < 7 * i)
    .tmp2 <- data.frame(as.table(by(.df, .df[, c("z"), drop = FALSE], function(d) if (all(is.na(d$nrisk))) 
        NA else min(d$nrisk - d$nevent - d$ncensor, na.rm = TRUE))))
    .tmp2$x <- 7 * i
    .tmp2$Freq[is.na(.tmp2$Freq)] <- .tmp1$Freq[is.na(.tmp2$Freq)]
    .tmp1 <- .tmp2
    .nrisk <- rbind(.nrisk, .tmp2)
}
.nrisk$y <- rep(seq(0.175, 0.025, -0.05), 4)
.plot <- ggplot(data = .fit, aes(x = x, y = y, colour = z)) + geom_step(data = subset(.fit, 
    !is.na(upper)), aes(y = upper), size = 1, lty = 2, alpha = 0.5, show.legend = FALSE, 
    na.rm = FALSE) + geom_step(data = subset(.fit, !is.na(lower)), aes(y = lower), 
    size = 1, lty = 2, alpha = 0.5, show.legend = FALSE, na.rm = FALSE) + geom_step(size = 1.5) + 
    geom_linerange(data = .cens, aes(x = x, ymin = y, ymax = y + 0.02), size = 1.5) + 
    geom_text(data = .pval, aes(y = y, x = x, label = label), colour = "black", 
        hjust = 0, vjust = -0.5, parse = TRUE, show.legend = FALSE, size = 14 * 
            0.282, family = "sans") + scale_x_continuous(breaks = seq(0, 21, 
    by = 7), limits = c(0, 21)) + scale_y_continuous(limits = c(0, 1), expand = c(0.01, 
    0)) + scale_colour_brewer(palette = "Set1") + xlab("Time from entry") + 
    ylab("Proportion of survival") + labs(colour = "Desc_Renda_Bruta") + theme_grey(base_size = 14, 
    base_family = "sans") + theme(legend.position = c(1, 1), legend.justification = c(1, 
    1), legend.background = element_rect(fill = "transparent"))
.nrisk$y <- ((.nrisk$y - 0.025)/(max(.nrisk$y) - 0.025) + 0.125) * 0.8
.plot2 <- ggplot(data = .nrisk, aes(x = x, y = y, label = Freq, colour = z)) + 
    geom_text(size = 14 * 0.282, family = "sans") + scale_x_continuous(breaks = seq(0, 
    21, by = 7), limits = c(0, 21)) + scale_y_continuous(limits = c(0, 1)) + 
    scale_colour_brewer(palette = "Set1") + ylab("Proportion of survival") + 
    RcmdrPlugin.KMggplot2::theme_natrisk(theme_grey, 14, "sans")
.plot3 <- ggplot(data = subset(.nrisk, x == 0), aes(x = x, y = y, label = z, 
    colour = z)) + geom_text(hjust = 0, size = 14 * 0.282, family = "sans") + 
    scale_x_continuous(limits = c(-5, 5)) + scale_y_continuous(limits = c(0, 
    1)) + scale_colour_brewer(palette = "Set1") + RcmdrPlugin.KMggplot2::theme_natrisk21(theme_grey, 
    14, "sans")
.plotb <- ggplot(.df, aes(x = x, y = y)) + geom_blank() + RcmdrPlugin.KMggplot2::theme_natriskbg(theme_grey, 
    14, "sans")
grid::grid.newpage()
grid::pushViewport(grid::viewport(layout = grid::grid.layout(2, 2, heights = unit(c(1, 
    4), c("null", "lines")), widths = unit(c(4, 1), c("lines", "null")))))
print(.plotb, vp = grid::viewport(layout.pos.row = 1:2, layout.pos.col = 1:2))
print(.plot, vp = grid::viewport(layout.pos.row = 1, layout.pos.col = 1:2))
print(.plot2, vp = grid::viewport(layout.pos.row = 2, layout.pos.col = 1:2))
print(.plot3, vp = grid::viewport(layout.pos.row = 2, layout.pos.col = 1))

.plot <- recordPlot()
print(.plot)
rm(.df, .fit, .pval, .ppval, .cens, .tmp1, .tmp2, .nrisk, .plot, .plot2, .plot3, 
    .plotb)
.Survfit <- survfit(Surv(tempo, status, type = "right") ~ Desc_Renda_Bruta, 
    conf.type = "log", conf.int = 0.95, type = "kaplan-meier", error = "greenwood", 
    data = mes0619)
.Survfit
Call: survfit(formula = Surv(tempo, status, type = "right") ~ Desc_Renda_Bruta, 
    data = mes0619, error = "greenwood", conf.type = "log", conf.int = 0.95, 
    type = "kaplan-meier")

                                     n events median 0.95LCL 0.95UCL
Desc_Renda_Bruta=até 20%         11157   7189   5.82    5.70    6.02
Desc_Renda_Bruta=entre 20% a 30%  3743   1863   8.12    7.67    8.46
Desc_Renda_Bruta=entre 30% a 40%  2529    321     NA      NA      NA
Desc_Renda_Bruta=mais de 40%       550    136  14.61   10.76      NA
quantile(.Survfit, quantiles = c(0.25, 0.5, 0.75))
$quantile
                                        25        50       75
Desc_Renda_Bruta=até 20%          2.487671  5.819178 10.45753
Desc_Renda_Bruta=entre 20% a 30%  3.589041  8.115068 14.45479
Desc_Renda_Bruta=entre 30% a 40% 12.183562        NA       NA
Desc_Renda_Bruta=mais de 40%      7.306849 14.605479       NA

$lower
                                        25        50       75
Desc_Renda_Bruta=até 20%          2.369863  5.698630 10.25205
Desc_Renda_Bruta=entre 20% a 30%  3.290411  7.668493 13.84110
Desc_Renda_Bruta=entre 30% a 40% 11.142466        NA       NA
Desc_Renda_Bruta=mais de 40%      6.246575 10.761644       NA

$upper
                                        25       50       75
Desc_Renda_Bruta=até 20%          2.597260 6.019178 10.71507
Desc_Renda_Bruta=entre 20% a 30%  3.860274 8.457534 15.20274
Desc_Renda_Bruta=entre 30% a 40% 15.175342       NA       NA
Desc_Renda_Bruta=mais de 40%      8.917808       NA       NA
remove(.Survfit)
require("ggplot2")
.df <- na.omit(data.frame(x = mes0619$tempo, y = mes0619$status, z = mes0619$Qtde_dependentes))
.df <- .df[do.call(order, .df[, c("z", "x"), drop = FALSE]), , drop = FALSE]
.fit <- survival::survfit(survival::Surv(time = x, event = y, type = "right") ~ 
    z, .df)
.pval <- plyr::ddply(.df, plyr::.(), function(x) {
    data.frame(x = 0, y = 0, df = 2, chisq = survival::survdiff(survival::Surv(time = x, 
        event = y, type = "right") ~ z, x)$chisq)
})
.pval$label <- paste0("paste(italic(p), \" = ", signif(1 - pchisq(.pval$chisq, 
    .pval$df), 3), "\")")
.fit <- data.frame(x = .fit$time, y = .fit$surv, nrisk = .fit$n.risk, nevent = .fit$n.event, 
    ncensor = .fit$n.censor, upper = .fit$upper, lower = .fit$lower)
.df <- .df[!duplicated(.df[, c("x", "z")]), ]
.df <- .fit <- data.frame(.fit, .df[, c("z"), drop = FALSE])
.df <- .fit <- rbind(unique(data.frame(x = 0, y = 1, nrisk = NA, nevent = NA, 
    ncensor = NA, upper = 1, lower = 1, .df[, c("z"), drop = FALSE])), .fit)
.cens <- subset(.fit, ncensor == 1)
.tmp1 <- data.frame(as.table(by(.df, .df[, c("z"), drop = FALSE], function(d) max(d$nrisk, 
    na.rm = TRUE))))
.tmp1$x <- 0
.nrisk <- .tmp1
for (i in 1:3) {
    .df <- subset(.fit, x < 7 * i)
    .tmp2 <- data.frame(as.table(by(.df, .df[, c("z"), drop = FALSE], function(d) if (all(is.na(d$nrisk))) 
        NA else min(d$nrisk - d$nevent - d$ncensor, na.rm = TRUE))))
    .tmp2$x <- 7 * i
    .tmp2$Freq[is.na(.tmp2$Freq)] <- .tmp1$Freq[is.na(.tmp2$Freq)]
    .tmp1 <- .tmp2
    .nrisk <- rbind(.nrisk, .tmp2)
}
.nrisk$y <- rep(seq(0.125, 0.025, -0.05), 4)
.plot <- ggplot(data = .fit, aes(x = x, y = y, colour = z)) + geom_step(data = subset(.fit, 
    !is.na(upper)), aes(y = upper), size = 1, lty = 2, alpha = 0.5, show.legend = FALSE, 
    na.rm = FALSE) + geom_step(data = subset(.fit, !is.na(lower)), aes(y = lower), 
    size = 1, lty = 2, alpha = 0.5, show.legend = FALSE, na.rm = FALSE) + geom_step(size = 1.5) + 
    geom_linerange(data = .cens, aes(x = x, ymin = y, ymax = y + 0.02), size = 1.5) + 
    geom_text(data = .pval, aes(y = y, x = x, label = label), colour = "black", 
        hjust = 0, vjust = -0.5, parse = TRUE, show.legend = FALSE, size = 14 * 
            0.282, family = "sans") + scale_x_continuous(breaks = seq(0, 21, 
    by = 7), limits = c(0, 21)) + scale_y_continuous(limits = c(0, 1), expand = c(0.01, 
    0)) + scale_colour_brewer(palette = "Set1") + xlab("Time from entry") + 
    ylab("Proportion of survival") + labs(colour = "Qtde_dependentes") + theme_grey(base_size = 14, 
    base_family = "sans") + theme(legend.position = c(1, 1), legend.justification = c(1, 
    1), legend.background = element_rect(fill = "transparent"))
.nrisk$y <- ((.nrisk$y - 0.025)/(max(.nrisk$y) - 0.025) + 0.125) * 0.8
.plot2 <- ggplot(data = .nrisk, aes(x = x, y = y, label = Freq, colour = z)) + 
    geom_text(size = 14 * 0.282, family = "sans") + scale_x_continuous(breaks = seq(0, 
    21, by = 7), limits = c(0, 21)) + scale_y_continuous(limits = c(0, 1)) + 
    scale_colour_brewer(palette = "Set1") + ylab("Proportion of survival") + 
    RcmdrPlugin.KMggplot2::theme_natrisk(theme_grey, 14, "sans")
.plot3 <- ggplot(data = subset(.nrisk, x == 0), aes(x = x, y = y, label = z, 
    colour = z)) + geom_text(hjust = 0, size = 14 * 0.282, family = "sans") + 
    scale_x_continuous(limits = c(-5, 5)) + scale_y_continuous(limits = c(0, 
    1)) + scale_colour_brewer(palette = "Set1") + RcmdrPlugin.KMggplot2::theme_natrisk21(theme_grey, 
    14, "sans")
.plotb <- ggplot(.df, aes(x = x, y = y)) + geom_blank() + RcmdrPlugin.KMggplot2::theme_natriskbg(theme_grey, 
    14, "sans")
grid::grid.newpage()
grid::pushViewport(grid::viewport(layout = grid::grid.layout(2, 2, heights = unit(c(1, 
    3.5), c("null", "lines")), widths = unit(c(4, 1), c("lines", "null")))))
print(.plotb, vp = grid::viewport(layout.pos.row = 1:2, layout.pos.col = 1:2))
print(.plot, vp = grid::viewport(layout.pos.row = 1, layout.pos.col = 1:2))
print(.plot2, vp = grid::viewport(layout.pos.row = 2, layout.pos.col = 1:2))
print(.plot3, vp = grid::viewport(layout.pos.row = 2, layout.pos.col = 1))

.plot <- recordPlot()
print(.plot)
rm(.df, .fit, .pval, .ppval, .cens, .tmp1, .tmp2, .nrisk, .plot, .plot2, .plot3, 
    .plotb)
.Survfit <- survfit(Surv(tempo, status, type = "right") ~ Qtde_dependentes, 
    conf.type = "log", conf.int = 0.95, type = "kaplan-meier", error = "greenwood", 
    data = mes0619)
.Survfit
Call: survfit(formula = Surv(tempo, status, type = "right") ~ Qtde_dependentes, 
    data = mes0619, error = "greenwood", conf.type = "log", conf.int = 0.95, 
    type = "kaplan-meier")

                                  n events median 0.95LCL 0.95UCL
Qtde_dependentes=até dois      6884   3382   7.41    7.19    7.73
Qtde_dependentes=dois ou mais   543    209   6.78    5.85    8.96
Qtde_dependentes=Nenhum       10552   5918   7.14    6.99    7.27
quantile(.Survfit, quantiles = c(0.25, 0.5, 0.75))
$quantile
                                    25       50       75
Qtde_dependentes=até dois     3.383562 7.405479 13.43836
Qtde_dependentes=dois ou mais 2.934247 6.780822 12.45205
Qtde_dependentes=Nenhum       3.093151 7.136986 13.45479

$lower
                                    25       50       75
Qtde_dependentes=até dois     3.186301 7.191781 12.68767
Qtde_dependentes=dois ou mais 2.600000 5.852055 11.17534
Qtde_dependentes=Nenhum       2.958904 6.989041 13.09589

$upper
                                    25       50       75
Qtde_dependentes=até dois     3.558904 7.731507 14.11233
Qtde_dependentes=dois ou mais 3.375342 8.958904       NA
Qtde_dependentes=Nenhum       3.224658 7.271233 14.02466
remove(.Survfit)

Regressão Logística

indices <- sample(1:nrow(mes0619), floor(nrow(mes0619) * 0.8))
treino <- mes0619[indices, ]
teste <- mes0619[-indices, ]

GLM.1 <- glm(status ~ Qtde_dependentes + SEXO + Tipo_Regiao + Desc_Renda_Bruta + 
    id_ini_serv_pub, family = binomial(logit), data = treino)
summary(GLM.1)

Call:
glm(formula = status ~ Qtde_dependentes + SEXO + Tipo_Regiao + 
    Desc_Renda_Bruta + id_ini_serv_pub, family = binomial(logit), 
    data = treino)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.7177  -1.1940   0.7611   0.9976   2.2620  

Coefficients:
                                   Estimate Std. Error z value Pr(>|z|)
(Intercept)                        0.212274   0.045813   4.633 3.60e-06
Qtde_dependentesdois ou mais      -0.296489   0.113041  -2.623 0.008720
Qtde_dependentesNenhum             0.145172   0.038914   3.731 0.000191
SEXOM                             -0.002498   0.038728  -0.064 0.948580
Tipo_RegiaoCidade_Média            0.226716   0.038393   5.905 3.52e-09
Tipo_RegiaoCidade_Pequena          0.349205   0.068397   5.106 3.30e-07
Desc_Renda_Brutaentre 20% a 30%   -0.545131   0.043284 -12.594  < 2e-16
Desc_Renda_Brutaentre 30% a 40%   -2.390953   0.069970 -34.171  < 2e-16
Desc_Renda_Brutamais de 40%       -1.685908   0.115851 -14.552  < 2e-16
id_ini_serv_pubentre 30 e 40 anos  0.508986   0.042291  12.035  < 2e-16
id_ini_serv_pubmais de 40 anos     0.448215   0.070502   6.357 2.05e-10
id_ini_serv_pubmenos de 20 anos   -0.271858   0.074741  -3.637 0.000275
                                     
(Intercept)                       ***
Qtde_dependentesdois ou mais      ** 
Qtde_dependentesNenhum            ***
SEXOM                                
Tipo_RegiaoCidade_Média           ***
Tipo_RegiaoCidade_Pequena         ***
Desc_Renda_Brutaentre 20% a 30%   ***
Desc_Renda_Brutaentre 30% a 40%   ***
Desc_Renda_Brutamais de 40%       ***
id_ini_serv_pubentre 30 e 40 anos ***
id_ini_serv_pubmais de 40 anos    ***
id_ini_serv_pubmenos de 20 anos   ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 19887  on 14382  degrees of freedom
Residual deviance: 17562  on 14371  degrees of freedom
AIC: 17586

Number of Fisher Scoring iterations: 4
exp(coef(GLM.1))
                      (Intercept)      Qtde_dependentesdois ou mais 
                       1.23648676                        0.74342387 
           Qtde_dependentesNenhum                             SEXOM 
                       1.15623874                        0.99750550 
          Tipo_RegiaoCidade_Média         Tipo_RegiaoCidade_Pequena 
                       1.25447390                        1.41793977 
  Desc_Renda_Brutaentre 20% a 30%   Desc_Renda_Brutaentre 30% a 40% 
                       0.57976605                        0.09154237 
      Desc_Renda_Brutamais de 40% id_ini_serv_pubentre 30 e 40 anos 
                       0.18527603                        1.66360325 
   id_ini_serv_pubmais de 40 anos   id_ini_serv_pubmenos de 20 anos 
                       1.56551583                        0.76196259 
modfinal <- step(GLM.1, scope = list(lower = ~1))
Start:  AIC=17586.22
status ~ Qtde_dependentes + SEXO + Tipo_Regiao + Desc_Renda_Bruta + 
    id_ini_serv_pub

                   Df Deviance   AIC
- SEXO              1    17562 17584
<none>                   17562 17586
- Qtde_dependentes  2    17588 17608
- Tipo_Regiao       2    17611 17631
- id_ini_serv_pub   3    17760 17778
- Desc_Renda_Bruta  3    19374 19392

Step:  AIC=17584.22
status ~ Qtde_dependentes + Tipo_Regiao + Desc_Renda_Bruta + 
    id_ini_serv_pub

                   Df Deviance   AIC
<none>                   17562 17584
- Qtde_dependentes  2    17589 17607
- Tipo_Regiao       2    17611 17629
- id_ini_serv_pub   3    17769 17785
- Desc_Renda_Bruta  3    19378 19394
summary(modfinal)

Call:
glm(formula = status ~ Qtde_dependentes + Tipo_Regiao + Desc_Renda_Bruta + 
    id_ini_serv_pub, family = binomial(logit), data = treino)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.7172  -1.1937   0.7607   0.9980   2.2616  

Coefficients:
                                  Estimate Std. Error z value Pr(>|z|)    
(Intercept)                        0.21105    0.04172   5.059 4.22e-07 ***
Qtde_dependentesdois ou mais      -0.29675    0.11297  -2.627 0.008617 ** 
Qtde_dependentesNenhum             0.14574    0.03790   3.845 0.000120 ***
Tipo_RegiaoCidade_Média            0.22668    0.03839   5.905 3.53e-09 ***
Tipo_RegiaoCidade_Pequena          0.34905    0.06835   5.106 3.28e-07 ***
Desc_Renda_Brutaentre 20% a 30%   -0.54512    0.04328 -12.594  < 2e-16 ***
Desc_Renda_Brutaentre 30% a 40%   -2.39105    0.06996 -34.180  < 2e-16 ***
Desc_Renda_Brutamais de 40%       -1.68653    0.11545 -14.608  < 2e-16 ***
id_ini_serv_pubentre 30 e 40 anos  0.50857    0.04180  12.167  < 2e-16 ***
id_ini_serv_pubmais de 40 anos     0.44743    0.06944   6.444 1.17e-10 ***
id_ini_serv_pubmenos de 20 anos   -0.27159    0.07462  -3.639 0.000273 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 19887  on 14382  degrees of freedom
Residual deviance: 17562  on 14372  degrees of freedom
AIC: 17584

Number of Fisher Scoring iterations: 4
tabela <- table(GLM.1$fitted.values > 0.55, treino$status)
tabela %>% kable()
0 1
FALSE 3512 1585
TRUE 3249 6037
sum(diag(tabela))/sum(tabela)
[1] 0.6639088
result <- predict(modfinal, treino, "response")
pred <- prediction(result, treino$status)
perf <- performance(pred, "fpr", "fnr")
plot(perf, print.cutoffs.at = seq(0.1, 0.9, by = 0.05))

---
title: "Diagnóstico Sobre a Aposentadoria no Serviço Público"
subtitle: "Aplicação de Regressão Logística"
date: "`r Sys.Date()`"
output:
    html_document:
      code_folding: hide
      code_download: true
---
```{r}
knitr::opts_chunk$set(cache=FALSE, message=FALSE, warning=FALSE, prompt=FALSE,      tidy=TRUE, comment=NA)
```



```{r knitr_init, echo=FALSE}


library(knitr)
library(rmdformats)
library(tidyverse)
library(knitr)
library(readxl)
library(lubridate)
library(rpivotTable)
library(data.table)
library(abind, pos=38)
library(e1071, pos=39)
library(car)
library(RcmdrMisc)
library(survival)
library(ggfortify)
library(ggplot2)
library(ROCR)
```

## Introdução

No serviço público é comum que alguns servidores não aposentam na data em que começa a ter o direito a aposentadoria e permanecem no trabalho até o tempo da idade compulsória recebendo um abono. Dessa forma, o objetivo do estudo consiste na exploração da teoria de análise de regressão logística para delinear o perfil dos servidores que adquirem o direito ao abono salarial e decidem se continuam trabalhando = 0 ou não = 1, situação que será a variável resposta de interesse neste caso.


## Base de Dados
Para subsidiar este estudo, foram coletadas informações do Data Warehouse do Sistema Integrado de Administração de Pessoal - DW SIAPE dos servidores públicos do Ministério da Economia - ME, ativos e inativos no período de janeiro de 2019 a julho de 2019.

As informações extraídas foram:

• Nome • Sexo • CPF (chave primária) • Matricula SIAPE • CLASSE_CARGO • PADRAO_CARGO • SITUACAO_VINCULO • REGIME_JURIDICO • JORNADA_DE_TRABALHO • Remuneração • Ano/Mês inicial do abono de permanência • Valor Abono (caso tenha) • Todas as gratificações e funções comissionadas • Funções – VPNI • Quantidade de dependentes • Descrição do cargo emprego • Nível de Escolaridade • Denominação do órgão de atuação • UF da UPAG de vinculação • Denominação unidade organizacional • ORGSUP_EXERCICIO • UF da Residência • Cidade da residência • Situação servidor • Tipo DE Aposentadoria • Data ingresso no serviço público • Data de nascimento • DATA ocorrência inatividade


### Descritivas dos dados

Inicialmente, selecionou-se o último mês desta base de dados, junho de 2019, e buscou os tempos iniciais para o estudo, ou seja, a data de início do abono e, para os casos da ocorrência do evento de aposentadorias, buscou-se a data deste fato. 

```{r}

mes0619 <- read.csv2("dados.csv")
mes0619 %>% DT::datatable()

```


```{r}
summary(mes0619[,-c(1,2)]) 

```



## Exploração dos Dados de Sobrevivência - Estimador Kaplan-Meier


A partir do Estimador de Kaplan-Meier é possível observar o tempo mediano de 7.66 anos que os servidores em abono salarial permanecem no trabalho atÃ© decidirem aposentarem.


```{r}
require("ggplot2")
.df <- na.omit(data.frame(x = mes0619$tempo, y = mes0619$status, z = factor("At risk")))
.df <- .df[do.call(order, .df[, c("z", "x"), drop = FALSE]), , drop = FALSE]
.fit <- survival::survfit(survival::Surv(time = x, event = y, type = "right") ~ z, .df)
.fit <- data.frame(x = .fit$time, y = .fit$surv, nrisk = .fit$n.risk, nevent = .fit$n.event, ncensor= 
  .fit$n.censor, upper = .fit$upper, lower = .fit$lower)
.df <- .df[!duplicated(.df[,c("x", "z")]), ]
.df <- .fit <- data.frame(.fit, .df[, c("z"), drop = FALSE])
.med <- plyr::ddply(.fit, plyr::.(z), function(x) {
data.frame(
 median = min(subset(x, y < (0.5 + .Machine$double.eps^0.5))$x)
)})
.df <- .fit <- rbind(unique(data.frame(x = 0, y = 1, nrisk = NA, nevent = NA, ncensor = NA, upper = 1, 
  lower = 1, .df[, c("z"), drop = FALSE])), .fit)
.cens <- subset(.fit, ncensor == 1)
.tmp1 <- data.frame(as.table(by(.df, .df[, c("z"), drop = FALSE], function(d) max(d$nrisk, na.rm = 
  TRUE))))
.tmp1$x <- 0
.nrisk <- .tmp1
for (i in 1:3) {.df <- subset(.fit, x < 7 * i); .tmp2 <- data.frame(as.table(by(.df, .df[, c("z"), drop 
  = FALSE], function(d) if (all(is.na(d$nrisk))) NA else min(d$nrisk - d$nevent - d$ncensor, na.rm = 
  TRUE)))); .tmp2$x <- 7 * i; .tmp2$Freq[is.na(.tmp2$Freq)] <- .tmp1$Freq[is.na(.tmp2$Freq)]; .tmp1 <- 
  .tmp2; .nrisk <- rbind(.nrisk, .tmp2)}
.nrisk$y <- rep(seq(0.025, 0.025, -0.05), 4)
.plot <- ggplot(data = .fit, aes(x = x, y = y, colour = z)) + 
  geom_step(data = subset(.fit, !is.na(upper)), aes(y = upper), size = 1, lty = 2, alpha = 0.5, 
  show.legend = FALSE, na.rm = FALSE) + 
  geom_step(data = subset(.fit, !is.na(lower)), aes(y = lower), size = 1, lty = 2, alpha = 0.5, 
  show.legend = FALSE, na.rm = FALSE) + 
  geom_step(size = 1.5) + 
  geom_linerange(data = .cens, aes(x = x, ymin = y, ymax = y + 0.02), size = 1.5) + 
  geom_vline(data = .med, aes(xintercept = median), colour = "black", lty = 2) + 
  scale_x_continuous(breaks = seq(0, 21, by = 7), limits = c(0, 21)) + 
  scale_y_continuous(limits = c(0, 1), expand = c(0.01, 0)) + 
  scale_colour_brewer(palette = "Set1") + 
  xlab("Tempo em anos de abono") + 
  ylab("Probabilidade S(t)") + 
  labs(title = "Estimador de Kaplan-Meier") + 
  theme_grey(base_size = 14, base_family = "sans") + 
  theme(legend.position = "none")
.nrisk$y <- 0.5
.plot2 <- ggplot(data = .nrisk, aes(x = x, y = y, label = Freq, colour = z)) + 
  geom_text(size = 14 * 0.282, family = "sans") + 
  scale_x_continuous(breaks = seq(0, 21, by = 7), limits = c(0, 21)) + 
  scale_y_continuous(limits = c(0, 1)) + 
  scale_colour_brewer(palette = "Set1") + 
  ylab("Tempo em anos de abono") + 
  RcmdrPlugin.KMggplot2::theme_natrisk(theme_grey, 14, "sans")
.plot3 <- ggplot(data = subset(.nrisk, x == 0), aes(x = x, y = y, label = z, colour = z)) + 
  geom_text(hjust = 0, size = 14 * 0.282, family = "sans") + 
  scale_x_continuous(limits = c(-5, 5)) + 
  scale_y_continuous(limits = c(0, 1)) + 
  scale_colour_brewer(palette = "Set1") + 
  RcmdrPlugin.KMggplot2::theme_natrisk21(theme_grey, 14, "sans")
.plotb <- ggplot(.df, aes(x = x, y = y)) + 
  geom_blank() + 
  RcmdrPlugin.KMggplot2::theme_natriskbg(theme_grey, 14, "sans")
grid::grid.newpage(); grid::pushViewport(grid::viewport(layout = grid::grid.layout(2, 2, heights = 
  unit(c(1, 2.5), c("null", "lines")), widths  = unit(c(4, 1), c("lines", "null"))))); print(.plotb, vp = 
  grid::viewport(layout.pos.row = 1:2, layout.pos.col = 1:2)); print(.plot , vp = 
  grid::viewport(layout.pos.row = 1  , layout.pos.col = 1:2)); print(.plot2, vp = 
  grid::viewport(layout.pos.row = 2  , layout.pos.col = 1:2)); print(.plot3, vp = 
  grid::viewport(layout.pos.row = 2  , layout.pos.col = 1  )); .plot <- recordPlot()
print(.plot)
rm(.df, .fit, .pmed, .med, .cens, .tmp1, .tmp2, .nrisk, .plot, .plot2, .plot3, .plotb)
```


```{r}
require("ggplot2")
.df <- na.omit(data.frame(x = mes0619$tempo, y = mes0619$status, z = 
  mes0619$SEXO))
.df <- .df[do.call(order, .df[, c("z", "x"), drop = FALSE]), , drop = FALSE]
.fit <- survival::survfit(survival::Surv(time = x, event = y, type = 
  "right") ~ z, .df)
.pval <- plyr::ddply(.df, plyr::.(),
 function(x) {
  data.frame(
   x = 0, y = 0, df = 1,
   chisq = survival::survdiff(
    survival::Surv(time = x, event = y, type = "right") ~ z, x
   )$chisq
)})
.pval$label <- paste0(
  "paste(italic(p), \" = ",
  signif(1 - pchisq(.pval$chisq, .pval$df), 3),
  "\")"
)
.fit <- data.frame(x = .fit$time, y = .fit$surv, nrisk = .fit$n.risk, 
  nevent = .fit$n.event, ncensor= .fit$n.censor, upper = .fit$upper, lower = 
  .fit$lower)
.df <- .df[!duplicated(.df[,c("x", "z")]), ]
.df <- .fit <- data.frame(.fit, .df[, c("z"), drop = FALSE])
.df <- .fit <- rbind(unique(data.frame(x = 0, y = 1, nrisk = NA, nevent = 
  NA, ncensor = NA, upper = 1, lower = 1, .df[, c("z"), drop = FALSE])), .fit)
.cens <- subset(.fit, ncensor == 1)
.tmp1 <- data.frame(as.table(by(.df, .df[, c("z"), drop = FALSE], 
  function(d) max(d$nrisk, na.rm = TRUE))))
.tmp1$x <- 0
.nrisk <- .tmp1
for (i in 1:3) {.df <- subset(.fit, x < 7 * i); .tmp2 <- 
  data.frame(as.table(by(.df, .df[, c("z"), drop = FALSE], function(d) if 
  (all(is.na(d$nrisk))) NA else min(d$nrisk - d$nevent - d$ncensor, na.rm = 
  TRUE)))); .tmp2$x <- 7 * i; .tmp2$Freq[is.na(.tmp2$Freq)] <- 
  .tmp1$Freq[is.na(.tmp2$Freq)]; .tmp1 <- .tmp2; .nrisk <- rbind(.nrisk, 
  .tmp2)}
.nrisk$y <- rep(seq(0.075, 0.025, -0.05), 4)
.plot <- ggplot(data = .fit, aes(x = x, y = y, colour = z)) + 
  geom_step(data = subset(.fit, !is.na(upper)), aes(y = upper), size = 1, 
  lty = 2, alpha = 0.5, show.legend = FALSE, na.rm = FALSE) + 
  geom_step(data = subset(.fit, !is.na(lower)), aes(y = lower), size = 1, 
  lty = 2, alpha = 0.5, show.legend = FALSE, na.rm = FALSE) + 
  geom_step(size = 1.5) + 
  geom_linerange(data = .cens, aes(x = x, ymin = y, ymax = y + 0.02), size 
  = 1.5) + 
  geom_text(data = .pval, aes(y = y, x = x, label = label), colour = 
  "black", hjust = 0, vjust = -0.5, parse = TRUE, show.legend = FALSE, size = 
  14 * 0.282, family = "sans") + 
  scale_x_continuous(breaks = seq(0, 21, by = 7), limits = c(0, 21)) + 
  scale_y_continuous(limits = c(0, 1), expand = c(0.01, 0)) + 
  scale_colour_brewer(palette = "Set1") + 
  xlab("Tempo em anos de abono") + 
  ylab("Probabidade S(t)") + 
  labs(colour = "SEXO") + 
  labs(title = "Estimador de Kaplan-Meier") + 
  theme_grey(base_size = 14, base_family = "sans") + 
  theme(legend.position = c(1, 1), legend.justification = c(1, 1), 
  legend.background = element_rect(fill = "transparent"))
.nrisk$y <- ((.nrisk$y - 0.025) / (max(.nrisk$y) - 0.025) + 0.5) * 0.5
.plot2 <- ggplot(data = .nrisk, aes(x = x, y = y, label = Freq, colour = 
  z)) + 
  geom_text(size = 14 * 0.282, family = "sans") + 
  scale_x_continuous(breaks = seq(0, 21, by = 7), limits = c(0, 21)) + 
  scale_y_continuous(limits = c(0, 1)) + 
  scale_colour_brewer(palette = "Set1") + 
  ylab("Probabidade S(t)") + 
  RcmdrPlugin.KMggplot2::theme_natrisk(theme_grey, 14, "sans")
.plot3 <- ggplot(data = subset(.nrisk, x == 0), aes(x = x, y = y, label = z,
   colour = z)) + 
  geom_text(hjust = 0, size = 14 * 0.282, family = "sans") + 
  scale_x_continuous(limits = c(-5, 5)) + 
  scale_y_continuous(limits = c(0, 1)) + 
  scale_colour_brewer(palette = "Set1") + 
  RcmdrPlugin.KMggplot2::theme_natrisk21(theme_grey, 14, "sans")
.plotb <- ggplot(.df, aes(x = x, y = y)) + 
  geom_blank() + 
  RcmdrPlugin.KMggplot2::theme_natriskbg(theme_grey, 14, "sans")
grid::grid.newpage(); grid::pushViewport(grid::viewport(layout = 
  grid::grid.layout(2, 2, heights = unit(c(1, 3), c("null", "lines")), widths 
   = unit(c(4, 1), c("lines", "null"))))); print(.plotb, vp = 
  grid::viewport(layout.pos.row = 1:2, layout.pos.col = 1:2)); print(.plot , 
  vp = grid::viewport(layout.pos.row = 1  , layout.pos.col = 1:2)); 
  print(.plot2, vp = grid::viewport(layout.pos.row = 2  , layout.pos.col = 
  1:2)); print(.plot3, vp = grid::viewport(layout.pos.row = 2  , 
  layout.pos.col = 1  )); .plot <- recordPlot()
print(.plot)
rm(.df, .fit, .pval, .ppval, .cens, .tmp1, .tmp2, .nrisk, .plot, .plot2, 
  .plot3, .plotb)
```


```{r}
.Survfit <- survfit(Surv(tempo, status, type="right") ~ SEXO, 
  conf.type="log", conf.int=0.95, type="kaplan-meier", error="greenwood", 
  data=mes0619)
.Survfit

quantile(.Survfit, quantiles=c(.25,.5,.75))
remove(.Survfit)

```

```{r}

require("ggplot2")
.df <- na.omit(data.frame(x = mes0619$tempo, y = mes0619$status, z = 
  mes0619$id_ini_serv_pub))
.df <- .df[do.call(order, .df[, c("z", "x"), drop = FALSE]), , drop = FALSE]
.fit <- survival::survfit(survival::Surv(time = x, event = y, type = 
  "right") ~ z, .df)
.pval <- plyr::ddply(.df, plyr::.(),
 function(x) {
  data.frame(
   x = 0, y = 0, df = 3,
   chisq = survival::survdiff(
    survival::Surv(time = x, event = y, type = "right") ~ z, x
   )$chisq
)})
.pval$label <- paste0(
  "paste(italic(p), \" = ",
  signif(1 - pchisq(.pval$chisq, .pval$df), 3),
  "\")"
)
.fit <- data.frame(x = .fit$time, y = .fit$surv, nrisk = .fit$n.risk, 
  nevent = .fit$n.event, ncensor= .fit$n.censor, upper = .fit$upper, lower = 
  .fit$lower)
.df <- .df[!duplicated(.df[,c("x", "z")]), ]
.df <- .fit <- data.frame(.fit, .df[, c("z"), drop = FALSE])
.df <- .fit <- rbind(unique(data.frame(x = 0, y = 1, nrisk = NA, nevent = 
  NA, ncensor = NA, upper = 1, lower = 1, .df[, c("z"), drop = FALSE])), .fit)
.cens <- subset(.fit, ncensor == 1)
.tmp1 <- data.frame(as.table(by(.df, .df[, c("z"), drop = FALSE], 
  function(d) max(d$nrisk, na.rm = TRUE))))
.tmp1$x <- 0
.nrisk <- .tmp1
for (i in 1:3) {.df <- subset(.fit, x < 7 * i); .tmp2 <- 
  data.frame(as.table(by(.df, .df[, c("z"), drop = FALSE], function(d) if 
  (all(is.na(d$nrisk))) NA else min(d$nrisk - d$nevent - d$ncensor, na.rm = 
  TRUE)))); .tmp2$x <- 7 * i; .tmp2$Freq[is.na(.tmp2$Freq)] <- 
  .tmp1$Freq[is.na(.tmp2$Freq)]; .tmp1 <- .tmp2; .nrisk <- rbind(.nrisk, 
  .tmp2)}
.nrisk$y <- rep(seq(0.175, 0.025, -0.05), 4)
.plot <- ggplot(data = .fit, aes(x = x, y = y, colour = z)) + 
  geom_step(data = subset(.fit, !is.na(upper)), aes(y = upper), size = 1, 
  lty = 2, alpha = 0.5, show.legend = FALSE, na.rm = FALSE) + 
  geom_step(data = subset(.fit, !is.na(lower)), aes(y = lower), size = 1, 
  lty = 2, alpha = 0.5, show.legend = FALSE, na.rm = FALSE) + 
  geom_step(size = 1.5) + 
  geom_linerange(data = .cens, aes(x = x, ymin = y, ymax = y + 0.02), size 
  = 1.5) + 
  geom_text(data = .pval, aes(y = y, x = x, label = label), colour = 
  "black", hjust = 0, vjust = -0.5, parse = TRUE, show.legend = FALSE, size = 
  14 * 0.282, family = "sans") + 
  scale_x_continuous(breaks = seq(0, 21, by = 7), limits = c(0, 21)) + 
  scale_y_continuous(limits = c(0, 1), expand = c(0.01, 0)) + 
  scale_colour_brewer(palette = "Set1") + 
  xlab("Tempo em anos de abono") + 
  ylab("Probabilidade de S(t)") + 
  labs(colour = "id_ini_serv_pub") + 
  labs(title = "Estimador de Kaplan-Meier") + 
  theme_grey(base_size = 14, base_family = "sans") + 
  theme(legend.position = c(1, 1), legend.justification = c(1, 1), 
  legend.background = element_rect(fill = "transparent"))
.nrisk$y <- ((.nrisk$y - 0.025) / (max(.nrisk$y) - 0.025) + 0.125) * 0.8
.plot2 <- ggplot(data = .nrisk, aes(x = x, y = y, label = Freq, colour = 
  z)) + 
  geom_text(size = 14 * 0.282, family = "sans") + 
  scale_x_continuous(breaks = seq(0, 21, by = 7), limits = c(0, 21)) + 
  scale_y_continuous(limits = c(0, 1)) + 
  scale_colour_brewer(palette = "Set1") + 
  ylab("Probabilidade de S(t)") + 
  RcmdrPlugin.KMggplot2::theme_natrisk(theme_grey, 14, "sans")
.plot3 <- ggplot(data = subset(.nrisk, x == 0), aes(x = x, y = y, label = z,
   colour = z)) + 
  geom_text(hjust = 0, size = 14 * 0.282, family = "sans") + 
  scale_x_continuous(limits = c(-5, 5)) + 
  scale_y_continuous(limits = c(0, 1)) + 
  scale_colour_brewer(palette = "Set1") + 
  RcmdrPlugin.KMggplot2::theme_natrisk21(theme_grey, 14, "sans")
.plotb <- ggplot(.df, aes(x = x, y = y)) + 
  geom_blank() + 
  RcmdrPlugin.KMggplot2::theme_natriskbg(theme_grey, 14, "sans")
grid::grid.newpage(); grid::pushViewport(grid::viewport(layout = 
  grid::grid.layout(2, 2, heights = unit(c(1, 4), c("null", "lines")), widths 
   = unit(c(4, 1), c("lines", "null"))))); print(.plotb, vp = 
  grid::viewport(layout.pos.row = 1:2, layout.pos.col = 1:2)); print(.plot , 
  vp = grid::viewport(layout.pos.row = 1  , layout.pos.col = 1:2)); 
  print(.plot2, vp = grid::viewport(layout.pos.row = 2  , layout.pos.col = 
  1:2)); print(.plot3, vp = grid::viewport(layout.pos.row = 2  , 
  layout.pos.col = 1  )); .plot <- recordPlot()
print(.plot)
rm(.df, .fit, .pval, .ppval, .cens, .tmp1, .tmp2, .nrisk, .plot, .plot2, 
  .plot3, .plotb)
```

```{r}
.Survfit <- survfit(Surv(tempo, status, type="right") ~ id_ini_serv_pub, 
  conf.type="log", conf.int=0.95, type="kaplan-meier", error="greenwood", 
  data=mes0619)
.Survfit

quantile(.Survfit, quantiles=c(.25,.5,.75)) 
remove(.Survfit)

```

```{r}
require("ggplot2")
.df <- na.omit(data.frame(x = mes0619$tempo, y = mes0619$status, z = 
  mes0619$Tipo_Regiao))
.df <- .df[do.call(order, .df[, c("z", "x"), drop = FALSE]), , drop = FALSE]
.fit <- survival::survfit(survival::Surv(time = x, event = y, type = 
  "right") ~ z, .df)
.pval <- plyr::ddply(.df, plyr::.(),
 function(x) {
  data.frame(
   x = 0, y = 0, df = 2,
   chisq = survival::survdiff(
    survival::Surv(time = x, event = y, type = "right") ~ z, x
   )$chisq
)})
.pval$label <- paste0(
  "paste(italic(p), \" = ",
  signif(1 - pchisq(.pval$chisq, .pval$df), 3),
  "\")"
)
.fit <- data.frame(x = .fit$time, y = .fit$surv, nrisk = .fit$n.risk, 
  nevent = .fit$n.event, ncensor= .fit$n.censor, upper = .fit$upper, lower = 
  .fit$lower)
.df <- .df[!duplicated(.df[,c("x", "z")]), ]
.df <- .fit <- data.frame(.fit, .df[, c("z"), drop = FALSE])
.df <- .fit <- rbind(unique(data.frame(x = 0, y = 1, nrisk = NA, nevent = 
  NA, ncensor = NA, upper = 1, lower = 1, .df[, c("z"), drop = FALSE])), .fit)
.cens <- subset(.fit, ncensor == 1)
.tmp1 <- data.frame(as.table(by(.df, .df[, c("z"), drop = FALSE], 
  function(d) max(d$nrisk, na.rm = TRUE))))
.tmp1$x <- 0
.nrisk <- .tmp1
for (i in 1:3) {.df <- subset(.fit, x < 7 * i); .tmp2 <- 
  data.frame(as.table(by(.df, .df[, c("z"), drop = FALSE], function(d) if 
  (all(is.na(d$nrisk))) NA else min(d$nrisk - d$nevent - d$ncensor, na.rm = 
  TRUE)))); .tmp2$x <- 7 * i; .tmp2$Freq[is.na(.tmp2$Freq)] <- 
  .tmp1$Freq[is.na(.tmp2$Freq)]; .tmp1 <- .tmp2; .nrisk <- rbind(.nrisk, 
  .tmp2)}
.nrisk$y <- rep(seq(0.125, 0.025, -0.05), 4)
.plot <- ggplot(data = .fit, aes(x = x, y = y, colour = z)) + 
  geom_step(data = subset(.fit, !is.na(upper)), aes(y = upper), size = 1, 
  lty = 2, alpha = 0.5, show.legend = FALSE, na.rm = FALSE) + 
  geom_step(data = subset(.fit, !is.na(lower)), aes(y = lower), size = 1, 
  lty = 2, alpha = 0.5, show.legend = FALSE, na.rm = FALSE) + 
  geom_step(size = 1.5) + 
  geom_linerange(data = .cens, aes(x = x, ymin = y, ymax = y + 0.02), size 
  = 1.5) + 
  geom_text(data = .pval, aes(y = y, x = x, label = label), colour = 
  "black", hjust = 0, vjust = -0.5, parse = TRUE, show.legend = FALSE, size = 
  14 * 0.282, family = "sans") + 
  scale_x_continuous(breaks = seq(0, 21, by = 7), limits = c(0, 21)) + 
  scale_y_continuous(limits = c(0, 1), expand = c(0.01, 0)) + 
  scale_colour_brewer(palette = "Set1") + 
  xlab("Tempo em anos de abono") + 
  ylab("Propabilidade de S(t)") + 
  labs(colour = "Tipo_Regiao") + 
  labs(title = "Estimador de Kaplan-Meier") + 
  theme_grey(base_size = 14, base_family = "sans") + 
  theme(legend.position = c(1, 1), legend.justification = c(1, 1), 
  legend.background = element_rect(fill = "transparent"))
.nrisk$y <- ((.nrisk$y - 0.025) / (max(.nrisk$y) - 0.025) + 0.125) * 0.8
.plot2 <- ggplot(data = .nrisk, aes(x = x, y = y, label = Freq, colour = 
  z)) + 
  geom_text(size = 14 * 0.282, family = "sans") + 
  scale_x_continuous(breaks = seq(0, 21, by = 7), limits = c(0, 21)) + 
  scale_y_continuous(limits = c(0, 1)) + 
  scale_colour_brewer(palette = "Set1") + 
  ylab("Propabilidade de S(t)") + 
  RcmdrPlugin.KMggplot2::theme_natrisk(theme_grey, 14, "sans")
.plot3 <- ggplot(data = subset(.nrisk, x == 0), aes(x = x, y = y, label = z,
   colour = z)) + 
  geom_text(hjust = 0, size = 14 * 0.282, family = "sans") + 
  scale_x_continuous(limits = c(-5, 5)) + 
  scale_y_continuous(limits = c(0, 1)) + 
  scale_colour_brewer(palette = "Set1") + 
  RcmdrPlugin.KMggplot2::theme_natrisk21(theme_grey, 14, "sans")
.plotb <- ggplot(.df, aes(x = x, y = y)) + 
  geom_blank() + 
  RcmdrPlugin.KMggplot2::theme_natriskbg(theme_grey, 14, "sans")
grid::grid.newpage(); grid::pushViewport(grid::viewport(layout = 
  grid::grid.layout(2, 2, heights = unit(c(1, 3.5), c("null", "lines")), 
  widths  = unit(c(4, 1), c("lines", "null"))))); print(.plotb, vp = 
  grid::viewport(layout.pos.row = 1:2, layout.pos.col = 1:2)); print(.plot , 
  vp = grid::viewport(layout.pos.row = 1  , layout.pos.col = 1:2)); 
  print(.plot2, vp = grid::viewport(layout.pos.row = 2  , layout.pos.col = 
  1:2)); print(.plot3, vp = grid::viewport(layout.pos.row = 2  , 
  layout.pos.col = 1  )); .plot <- recordPlot()
print(.plot)
rm(.df, .fit, .pval, .ppval, .cens, .tmp1, .tmp2, .nrisk, .plot, .plot2, 
  .plot3, .plotb)
```

```{r}
.Survfit <- survfit(Surv(tempo, status, type="right") ~ Tipo_Regiao, 
  conf.type="log", conf.int=0.95, type="kaplan-meier", error="greenwood", 
  data=mes0619)
.Survfit

quantile(.Survfit, quantiles=c(.25,.5,.75)) 
remove(.Survfit)
```


```{r}
require("ggplot2")
.df <- na.omit(data.frame(x = mes0619$tempo, y = mes0619$status, z = 
  mes0619$Desc_Renda_Bruta))
.df <- .df[do.call(order, .df[, c("z", "x"), drop = FALSE]), , drop = FALSE]
.fit <- survival::survfit(survival::Surv(time = x, event = y, type = 
  "right") ~ z, .df)
.pval <- plyr::ddply(.df, plyr::.(),
 function(x) {
  data.frame(
   x = 0, y = 0, df = 3,
   chisq = survival::survdiff(
    survival::Surv(time = x, event = y, type = "right") ~ z, x
   )$chisq
)})
.pval$label <- paste0(
  "paste(italic(p), \" = ",
  signif(1 - pchisq(.pval$chisq, .pval$df), 3),
  "\")"
)
.fit <- data.frame(x = .fit$time, y = .fit$surv, nrisk = .fit$n.risk, 
  nevent = .fit$n.event, ncensor= .fit$n.censor, upper = .fit$upper, lower = 
  .fit$lower)
.df <- .df[!duplicated(.df[,c("x", "z")]), ]
.df <- .fit <- data.frame(.fit, .df[, c("z"), drop = FALSE])
.df <- .fit <- rbind(unique(data.frame(x = 0, y = 1, nrisk = NA, nevent = 
  NA, ncensor = NA, upper = 1, lower = 1, .df[, c("z"), drop = FALSE])), .fit)
.cens <- subset(.fit, ncensor == 1)
.tmp1 <- data.frame(as.table(by(.df, .df[, c("z"), drop = FALSE], 
  function(d) max(d$nrisk, na.rm = TRUE))))
.tmp1$x <- 0
.nrisk <- .tmp1
for (i in 1:3) {.df <- subset(.fit, x < 7 * i); .tmp2 <- 
  data.frame(as.table(by(.df, .df[, c("z"), drop = FALSE], function(d) if 
  (all(is.na(d$nrisk))) NA else min(d$nrisk - d$nevent - d$ncensor, na.rm = 
  TRUE)))); .tmp2$x <- 7 * i; .tmp2$Freq[is.na(.tmp2$Freq)] <- 
  .tmp1$Freq[is.na(.tmp2$Freq)]; .tmp1 <- .tmp2; .nrisk <- rbind(.nrisk, 
  .tmp2)}
.nrisk$y <- rep(seq(0.175, 0.025, -0.05), 4)
.plot <- ggplot(data = .fit, aes(x = x, y = y, colour = z)) + 
  geom_step(data = subset(.fit, !is.na(upper)), aes(y = upper), size = 1, 
  lty = 2, alpha = 0.5, show.legend = FALSE, na.rm = FALSE) + 
  geom_step(data = subset(.fit, !is.na(lower)), aes(y = lower), size = 1, 
  lty = 2, alpha = 0.5, show.legend = FALSE, na.rm = FALSE) + 
  geom_step(size = 1.5) + 
  geom_linerange(data = .cens, aes(x = x, ymin = y, ymax = y + 0.02), size 
  = 1.5) + 
  geom_text(data = .pval, aes(y = y, x = x, label = label), colour = 
  "black", hjust = 0, vjust = -0.5, parse = TRUE, show.legend = FALSE, size = 
  14 * 0.282, family = "sans") + 
  scale_x_continuous(breaks = seq(0, 21, by = 7), limits = c(0, 21)) + 
  scale_y_continuous(limits = c(0, 1), expand = c(0.01, 0)) + 
  scale_colour_brewer(palette = "Set1") + 
  xlab("Time from entry") + 
  ylab("Proportion of survival") + 
  labs(colour = "Desc_Renda_Bruta") + 
  theme_grey(base_size = 14, base_family = "sans") + 
  theme(legend.position = c(1, 1), legend.justification = c(1, 1), 
  legend.background = element_rect(fill = "transparent"))
.nrisk$y <- ((.nrisk$y - 0.025) / (max(.nrisk$y) - 0.025) + 0.125) * 0.8
.plot2 <- ggplot(data = .nrisk, aes(x = x, y = y, label = Freq, colour = 
  z)) + 
  geom_text(size = 14 * 0.282, family = "sans") + 
  scale_x_continuous(breaks = seq(0, 21, by = 7), limits = c(0, 21)) + 
  scale_y_continuous(limits = c(0, 1)) + 
  scale_colour_brewer(palette = "Set1") + 
  ylab("Proportion of survival") + 
  RcmdrPlugin.KMggplot2::theme_natrisk(theme_grey, 14, "sans")
.plot3 <- ggplot(data = subset(.nrisk, x == 0), aes(x = x, y = y, label = z,
   colour = z)) + 
  geom_text(hjust = 0, size = 14 * 0.282, family = "sans") + 
  scale_x_continuous(limits = c(-5, 5)) + 
  scale_y_continuous(limits = c(0, 1)) + 
  scale_colour_brewer(palette = "Set1") + 
  RcmdrPlugin.KMggplot2::theme_natrisk21(theme_grey, 14, "sans")
.plotb <- ggplot(.df, aes(x = x, y = y)) + 
  geom_blank() + 
  RcmdrPlugin.KMggplot2::theme_natriskbg(theme_grey, 14, "sans")
grid::grid.newpage(); grid::pushViewport(grid::viewport(layout = 
  grid::grid.layout(2, 2, heights = unit(c(1, 4), c("null", "lines")), widths 
   = unit(c(4, 1), c("lines", "null"))))); print(.plotb, vp = 
  grid::viewport(layout.pos.row = 1:2, layout.pos.col = 1:2)); print(.plot , 
  vp = grid::viewport(layout.pos.row = 1  , layout.pos.col = 1:2)); 
  print(.plot2, vp = grid::viewport(layout.pos.row = 2  , layout.pos.col = 
  1:2)); print(.plot3, vp = grid::viewport(layout.pos.row = 2  , 
  layout.pos.col = 1  )); .plot <- recordPlot()
print(.plot)
rm(.df, .fit, .pval, .ppval, .cens, .tmp1, .tmp2, .nrisk, .plot, .plot2, 
  .plot3, .plotb)
```



```{r}
.Survfit <- survfit(Surv(tempo, status, type="right") ~ Desc_Renda_Bruta, 
  conf.type="log", conf.int=0.95, type="kaplan-meier", error="greenwood", 
  data=mes0619)
.Survfit

quantile(.Survfit, quantiles=c(.25,.5,.75))
remove(.Survfit)
```


```{r}
require("ggplot2")
.df <- na.omit(data.frame(x = mes0619$tempo, y = mes0619$status, z = 
  mes0619$Qtde_dependentes))
.df <- .df[do.call(order, .df[, c("z", "x"), drop = FALSE]), , drop = FALSE]
.fit <- survival::survfit(survival::Surv(time = x, event = y, type = 
  "right") ~ z, .df)
.pval <- plyr::ddply(.df, plyr::.(),
 function(x) {
  data.frame(
   x = 0, y = 0, df = 2,
   chisq = survival::survdiff(
    survival::Surv(time = x, event = y, type = "right") ~ z, x
   )$chisq
)})
.pval$label <- paste0(
  "paste(italic(p), \" = ",
  signif(1 - pchisq(.pval$chisq, .pval$df), 3),
  "\")"
)
.fit <- data.frame(x = .fit$time, y = .fit$surv, nrisk = .fit$n.risk, 
  nevent = .fit$n.event, ncensor= .fit$n.censor, upper = .fit$upper, lower = 
  .fit$lower)
.df <- .df[!duplicated(.df[,c("x", "z")]), ]
.df <- .fit <- data.frame(.fit, .df[, c("z"), drop = FALSE])
.df <- .fit <- rbind(unique(data.frame(x = 0, y = 1, nrisk = NA, nevent = 
  NA, ncensor = NA, upper = 1, lower = 1, .df[, c("z"), drop = FALSE])), .fit)
.cens <- subset(.fit, ncensor == 1)
.tmp1 <- data.frame(as.table(by(.df, .df[, c("z"), drop = FALSE], 
  function(d) max(d$nrisk, na.rm = TRUE))))
.tmp1$x <- 0
.nrisk <- .tmp1
for (i in 1:3) {.df <- subset(.fit, x < 7 * i); .tmp2 <- 
  data.frame(as.table(by(.df, .df[, c("z"), drop = FALSE], function(d) if 
  (all(is.na(d$nrisk))) NA else min(d$nrisk - d$nevent - d$ncensor, na.rm = 
  TRUE)))); .tmp2$x <- 7 * i; .tmp2$Freq[is.na(.tmp2$Freq)] <- 
  .tmp1$Freq[is.na(.tmp2$Freq)]; .tmp1 <- .tmp2; .nrisk <- rbind(.nrisk, 
  .tmp2)}
.nrisk$y <- rep(seq(0.125, 0.025, -0.05), 4)
.plot <- ggplot(data = .fit, aes(x = x, y = y, colour = z)) + 
  geom_step(data = subset(.fit, !is.na(upper)), aes(y = upper), size = 1, 
  lty = 2, alpha = 0.5, show.legend = FALSE, na.rm = FALSE) + 
  geom_step(data = subset(.fit, !is.na(lower)), aes(y = lower), size = 1, 
  lty = 2, alpha = 0.5, show.legend = FALSE, na.rm = FALSE) + 
  geom_step(size = 1.5) + 
  geom_linerange(data = .cens, aes(x = x, ymin = y, ymax = y + 0.02), size 
  = 1.5) + 
  geom_text(data = .pval, aes(y = y, x = x, label = label), colour = 
  "black", hjust = 0, vjust = -0.5, parse = TRUE, show.legend = FALSE, size = 
  14 * 0.282, family = "sans") + 
  scale_x_continuous(breaks = seq(0, 21, by = 7), limits = c(0, 21)) + 
  scale_y_continuous(limits = c(0, 1), expand = c(0.01, 0)) + 
  scale_colour_brewer(palette = "Set1") + 
  xlab("Time from entry") + 
  ylab("Proportion of survival") + 
  labs(colour = "Qtde_dependentes") + 
  theme_grey(base_size = 14, base_family = "sans") + 
  theme(legend.position = c(1, 1), legend.justification = c(1, 1), 
  legend.background = element_rect(fill = "transparent"))
.nrisk$y <- ((.nrisk$y - 0.025) / (max(.nrisk$y) - 0.025) + 0.125) * 0.8
.plot2 <- ggplot(data = .nrisk, aes(x = x, y = y, label = Freq, colour = 
  z)) + 
  geom_text(size = 14 * 0.282, family = "sans") + 
  scale_x_continuous(breaks = seq(0, 21, by = 7), limits = c(0, 21)) + 
  scale_y_continuous(limits = c(0, 1)) + 
  scale_colour_brewer(palette = "Set1") + 
  ylab("Proportion of survival") + 
  RcmdrPlugin.KMggplot2::theme_natrisk(theme_grey, 14, "sans")
.plot3 <- ggplot(data = subset(.nrisk, x == 0), aes(x = x, y = y, label = z,
   colour = z)) + 
  geom_text(hjust = 0, size = 14 * 0.282, family = "sans") + 
  scale_x_continuous(limits = c(-5, 5)) + 
  scale_y_continuous(limits = c(0, 1)) + 
  scale_colour_brewer(palette = "Set1") + 
  RcmdrPlugin.KMggplot2::theme_natrisk21(theme_grey, 14, "sans")
.plotb <- ggplot(.df, aes(x = x, y = y)) + 
  geom_blank() + 
  RcmdrPlugin.KMggplot2::theme_natriskbg(theme_grey, 14, "sans")
grid::grid.newpage(); grid::pushViewport(grid::viewport(layout = 
  grid::grid.layout(2, 2, heights = unit(c(1, 3.5), c("null", "lines")), 
  widths  = unit(c(4, 1), c("lines", "null"))))); print(.plotb, vp = 
  grid::viewport(layout.pos.row = 1:2, layout.pos.col = 1:2)); print(.plot , 
  vp = grid::viewport(layout.pos.row = 1  , layout.pos.col = 1:2)); 
  print(.plot2, vp = grid::viewport(layout.pos.row = 2  , layout.pos.col = 
  1:2)); print(.plot3, vp = grid::viewport(layout.pos.row = 2  , 
  layout.pos.col = 1  )); .plot <- recordPlot()
print(.plot)
rm(.df, .fit, .pval, .ppval, .cens, .tmp1, .tmp2, .nrisk, .plot, .plot2, 
  .plot3, .plotb)
```

```{r}
.Survfit <- survfit(Surv(tempo, status, type="right") ~ Qtde_dependentes, 
  conf.type="log", conf.int=0.95, type="kaplan-meier", error="greenwood", 
  data=mes0619)
.Survfit

quantile(.Survfit, quantiles=c(.25,.5,.75)) 
remove(.Survfit)
```


## Regressão Logística

```{r}
indices <- sample(1:nrow(mes0619),floor(nrow(mes0619)*.8))
treino <- mes0619[indices,]
teste <- mes0619[-indices,]

GLM.1 <- glm(status ~ Qtde_dependentes + SEXO + Tipo_Regiao + Desc_Renda_Bruta + id_ini_serv_pub, 
             family=binomial(logit), data= treino)
summary(GLM.1)
exp(coef(GLM.1))
```
```{r}
modfinal <- step(GLM.1,scope = list(lower=~1))
summary(modfinal)

```

```{r}
tabela <- table(GLM.1$fitted.values>.55,treino$status)
tabela %>% kable()
sum(diag(tabela))/sum(tabela)
```

```{r}
result <- predict(modfinal,treino,"response")
pred <- prediction(result,treino$status)
perf <- performance(pred,"fpr","fnr")
plot(perf,print.cutoffs.at=seq(.1,.9,by=.05))
```

